Load the cleaned data from
data_preparation.rmd. Assumes
koi_data.Rda contains the necessary columns.
Define target and predictor columns. Review this list carefully.
# Name of the column to predict ("koi_pdisposition" or "koi_disposition").
target_variable_name <- "koi_pdisposition" # Or 'koi_disposition'
predictor_cols <- c(
  # Numeric Predictors (Examples)
  "koi_period", "koi_duration", "koi_depth", "koi_prad", "koi_teq",
  "koi_insol", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad",
  "koi_smass", "koi_impact", "koi_ror", "koi_srho", "koi_sma", "koi_incl",
  "koi_dor", "koi_ldm_coeff1", "koi_ldm_coeff2", "koi_smet"
)
selected_cols <- c(target_variable_name, predictor_cols)
# Check which selected columns actually exist in the loaded data
selected_cols_exist <- selected_cols[selected_cols %in% names(koi_data_raw)]
missing_cols <- setdiff(selected_cols, selected_cols_exist)
if (length(missing_cols) > 0) {
  # Signal via warning() rather than print() so knitr/callers can surface
  # or suppress it through the normal condition system
  warning(
    "The following selected columns were not found: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
if (!target_variable_name %in% selected_cols_exist) {
  stop(paste("Target variable", target_variable_name, "not found in the data!"))
}
# Subset the data; drop = FALSE guards against the data frame collapsing to a
# vector in the edge case where only one selected column exists
koi_data <- koi_data_raw[, selected_cols_exist, drop = FALSE]
print(paste("Selected", ncol(koi_data), "columns for analysis."))## [1] "Selected 21 columns for analysis."
Stratified split based on the target variable.
# Stratified 80/20 split on the target (caret::createDataPartition) so class
# balance is preserved in both partitions.
set.seed(42) # for reproducibility
# list = FALSE returns a matrix of row indices rather than a list
train_indices <- createDataPartition(koi_data[[target_variable_name]], p = 0.8, list = FALSE)
train_data <- koi_data[train_indices, ]
test_data <- koi_data[-train_indices, ]
print(paste("Training set size:", nrow(train_data)))## [1] "Training set size: 6399"
## [1] "Test set size: 1599"
Apply simple median/mode imputation. Consider more advanced methods if NAs are numerous or potentially non-random.
# --- Impute Training Data ---
# Split the training columns into numeric vs character/factor predictors; the
# target is excluded from the categorical list so it is never imputed.
numeric_predictors_train <- names(train_data)[vapply(train_data, is.numeric, logical(1))]
factor_predictors_train <- names(train_data)[vapply(train_data, function(x) is.character(x) || is.factor(x), logical(1))]
factor_predictors_train <- setdiff(factor_predictors_train, target_variable_name)
# Fill numeric NAs with the TRAINING median, and remember each median so the
# exact same value can later be applied to the test set (avoids leakage).
train_medians <- list()
for (num_col in numeric_predictors_train) {
  na_mask <- is.na(train_data[[num_col]])
  if (!any(na_mask)) next
  median_val <- median(train_data[[num_col]], na.rm = TRUE)
  if (is.na(median_val)) {
    # Entire column is NA in train: fall back to 0 and flag it
    median_val <- 0
    print(paste("Warning: Train median NA for", num_col))
  }
  train_data[[num_col]][na_mask] <- median_val
  train_medians[[num_col]] <- median_val # reused for the test set below
}
# Impute character/factor with mode (calculated from training data)
# NOTE: calculate_mode() is defined in an earlier chunk — assumed to return a
# single scalar (NA when the column has no non-missing values).
train_modes <- list()
for (col in factor_predictors_train) {
  if (any(is.na(train_data[[col]]))) {
    mode_val <- calculate_mode(train_data[[col]], na.rm = TRUE)
    if (is.na(mode_val)) {
      mode_val <- "Missing"
      print(paste("Warning: Train mode NA for", col))
    }
    # BUG FIX: writing a value that is not among a factor's levels produces NA
    # (with an "invalid factor level" warning) instead of imputing — e.g. the
    # "Missing" fallback above. Extend the levels first so the assignment is
    # well-defined for factor columns; character columns are unaffected.
    if (is.factor(train_data[[col]]) && !(mode_val %in% levels(train_data[[col]]))) {
      levels(train_data[[col]]) <- c(levels(train_data[[col]]), mode_val)
    }
    train_data[[col]][is.na(train_data[[col]])] <- mode_val
    train_modes[[col]] <- mode_val # Store mode for applying to test set
  }
}
# Remove rows with NA in target (TRAIN)
# (imputation above only touched predictors; a missing label cannot be used)
rows_before_na_target_train <- nrow(train_data)
train_data <- train_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(train_data) < rows_before_na_target_train) print("Removed rows with NA target in TRAIN")
# --- Impute Test Data (using values from training data) ---
# Test NAs are filled with TRAINING medians/modes — never test statistics —
# so no information leaks from the test set into preprocessing.
for (col in names(train_medians)) {
if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
test_data[[col]][is.na(test_data[[col]])] <- train_medians[[col]]
# print(paste("Imputed NAs in TEST:", col))
}
}
# Impute character/factor with *training* mode
for (col in names(train_modes)) {
if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
test_data[[col]][is.na(test_data[[col]])] <- train_modes[[col]]
# print(paste("Imputed NAs in TEST:", col))
}
}
# Handle any remaining NAs in test set predictors if medians/modes couldn't be calculated or applied
# (this catches columns that had NAs only in the test split, for which no
# training median/mode was stored)
na_check_test <- colSums(is.na(test_data %>% select(-all_of(target_variable_name))))
if (any(na_check_test > 0)) {
print("Warning: NAs still present in test set predictors after imputation:")
print(na_check_test[na_check_test > 0])
# Consider removing these test rows or using a more robust imputation
test_data <- test_data[complete.cases(test_data %>% select(-all_of(target_variable_name))), ]
print("Removed test rows with remaining NAs in predictors.")
}
# Remove rows with NA in target (TEST)
rows_before_na_target_test <- nrow(test_data)
test_data <- test_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(test_data) < rows_before_na_target_test) print("Removed rows with NA target in TEST")Ensure categorical predictors and the target variable are factors with consistent levels.
# Binary false-positive flag columns that should be treated as categorical.
# NOTE(review): these flags are not in predictor_cols above, so after the
# earlier column subsetting they will usually be absent — confirm intent.
factor_cols_to_convert <- c(
"koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec" # Flags
)
factor_cols_exist_train <- factor_cols_to_convert[factor_cols_to_convert %in% names(train_data)]
# Convert predictor factors in training data
if (length(factor_cols_exist_train) > 0) {
train_data <- train_data %>%
mutate(across(all_of(factor_cols_exist_train), as.factor))
print(paste("Converted predictor(s) to factor in train data:", paste(factor_cols_exist_train, collapse = ", ")))
}
# Convert predictor factors in test data (using levels from training data if possible)
# Test values outside the training levels become NA by construction of factor()
factor_cols_exist_test <- factor_cols_to_convert[factor_cols_to_convert %in% names(test_data)]
if (length(factor_cols_exist_test) > 0) {
for (col in factor_cols_exist_test) {
# Ensure levels in test match train, adding if necessary
train_levels <- levels(train_data[[col]])
test_data[[col]] <- factor(test_data[[col]], levels = train_levels)
}
print(paste("Converted predictor(s) to factor in test data:", paste(factor_cols_exist_test, collapse = ", ")))
}
# Convert target variable to factor with desired levels and labels
# NOTE(review): any target value other than "CANDIDATE"/"FALSE POSITIVE"
# (e.g. "CONFIRMED" if koi_disposition were used) silently becomes NA here.
train_data[[target_variable_name]] <- factor(train_data[[target_variable_name]],
levels = c("CANDIDATE", "FALSE POSITIVE"),
labels = c("candidate", "false_positive")
)
test_data[[target_variable_name]] <- factor(test_data[[target_variable_name]],
levels = c("CANDIDATE", "FALSE POSITIVE"),
labels = c("candidate", "false_positive")
)
# Define positive class based on the new factor levels
# (second level, "false_positive", matches glm's modelled event class)
positive_class <- levels(train_data[[target_variable_name]])[2]
if (is.na(positive_class)) stop("Could not determine positive class level after factoring.")
print(paste("Positive class for evaluation:", positive_class))## [1] "Positive class for evaluation: false_positive"
Scale numeric predictors AFTER splitting and handling NAs/factors. Fit scaler only on training data.
# Numeric columns of the (now imputed) training data; these are the scaling
# candidates. The scaler is fitted on TRAIN only to avoid leakage.
numeric_predictors_final <- names(train_data)[sapply(train_data, is.numeric)]
# Check for zero variance columns before scaling
# (caret::nearZeroVar with saveMetrics returns a zeroVar flag per column)
nzv <- nearZeroVar(train_data[, numeric_predictors_final], saveMetrics = TRUE)
cols_to_scale <- rownames(nzv)[!nzv$zeroVar]
if (length(cols_to_scale) < length(numeric_predictors_final)) {
print("Warning: Found zero-variance numeric columns, excluding from scaling:")
print(rownames(nzv)[nzv$zeroVar])
}
if (length(cols_to_scale) > 0) {
# Center/scale parameters estimated from train, applied to both partitions
scaler <- preProcess(train_data[, cols_to_scale], method = c("center", "scale"))
train_data_scaled <- predict(scaler, train_data)
test_data_scaled <- predict(scaler, test_data)
print(paste("Scaled numeric predictors:", paste(cols_to_scale, collapse = ", ")))
} else {
print("No non-zero variance numeric predictors found to scale.")
train_data_scaled <- train_data
test_data_scaled <- test_data
}## [1] "Scaled numeric predictors: koi_period, koi_duration, koi_depth, koi_prad, koi_teq, koi_insol, koi_model_snr, koi_steff, koi_slogg, koi_srad, koi_smass, koi_impact, koi_ror, koi_srho, koi_sma, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, koi_smet"
Define the formula and fit the GLM.
# Model the target on ALL remaining columns ("~ ."), so later column drops
# automatically propagate into refits of this formula.
glm_original_formula <- as.formula(paste(target_variable_name, "~ ."))
print(paste("Using formula:", deparse(glm_original_formula)))## [1] "Using formula: koi_pdisposition ~ ."
# Logistic regression (binomial GLM, logit link) on the scaled training data;
# glm models the probability of the SECOND factor level ("false_positive").
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.09986 0.09556 11.510 < 2e-16 ***
## koi_period 0.25772 0.21190 1.216 0.22390
## koi_duration 1.15093 0.09986 11.526 < 2e-16 ***
## koi_depth 2.19920 0.40141 5.479 4.28e-08 ***
## koi_prad 0.25948 0.38662 0.671 0.50212
## koi_teq 1.82665 0.11695 15.619 < 2e-16 ***
## koi_insol -0.44363 0.05698 -7.786 6.90e-15 ***
## koi_model_snr -0.04758 0.08904 -0.534 0.59313
## koi_steff -0.48874 0.15495 -3.154 0.00161 **
## koi_slogg 0.67690 0.08798 7.694 1.43e-14 ***
## koi_srad 0.01393 0.07724 0.180 0.85685
## koi_smass 0.44922 0.09916 4.530 5.89e-06 ***
## koi_impact 0.53968 0.10365 5.207 1.92e-07 ***
## koi_ror 2.55320 0.49385 5.170 2.34e-07 ***
## koi_srho 0.02435 0.03015 0.808 0.41923
## koi_sma -0.29034 0.24926 -1.165 0.24411
## koi_incl -0.41634 0.09885 -4.212 2.53e-05 ***
## koi_dor 0.58369 0.08070 7.233 4.74e-13 ***
## koi_ldm_coeff1 -1.05599 0.46039 -2.294 0.02181 *
## koi_ldm_coeff2 -0.98381 0.36463 -2.698 0.00697 **
## koi_smet -0.56487 0.05652 -9.995 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8870.4 on 6398 degrees of freedom
## Residual deviance: 5199.5 on 6378 degrees of freedom
## AIC: 5241.5
##
## Number of Fisher Scoring iterations: 9
Predict on the test set and evaluate.
# Score the fitted GLM on the held-out test set
original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Clamp probabilities strictly inside (0, 1) for downstream log-based metrics
original_probs <- pmin(pmax(original_probs, 1e-15), 1 - 1e-15)
# Classify with a fixed 0.5 cut-off
threshold <- 0.5
# Prediction factor uses the test target's level ordering
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # 'candidate'
predicted_labels <- ifelse(original_probs > threshold, positive_class, negative_class)
original_preds <- factor(predicted_labels, levels = test_target_levels)
test_target_factor <- test_data_scaled[[target_variable_name]]
# positive = positive_class makes sensitivity/PPV refer to 'false_positive'
cm <- confusionMatrix(original_preds, test_target_factor, positive = positive_class)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 708 198
## false_positive 98 595
##
## Accuracy : 0.8149
## 95% CI : (0.795, 0.8336)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6294
##
## Mcnemar's Test P-Value : 8.702e-09
##
## Sensitivity : 0.7503
## Specificity : 0.8784
## Pos Pred Value : 0.8586
## Neg Pred Value : 0.7815
## Prevalence : 0.4959
## Detection Rate : 0.3721
## Detection Prevalence : 0.4334
## Balanced Accuracy : 0.8144
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Long-format (Prediction, Reference, Freq) table from the caret confusion matrix
cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC from predicted probabilities; 'levels' fixes the control/case order
# (control = candidate, case = false_positive).
# NOTE(review): 'direction' is not set — pROC auto-detects it, which can
# silently flip the curve; consider passing direction = "<" explicitly.
roc_original <- roc(
response = test_target_factor,
predictor = original_probs,
levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))## [1] "GLM Original Features - AUC: 0.8882"
Plot ROC curve
# pROC's plot method; print.auc annotates the AUC on the curve
plot(roc_original,
main = "ROC Curve - GLM Original Features",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Check for influential points using cook’s distance.
Find and remove all influential points with Cook’s distance > 0.5
# Cook's distance of each training observation under the fitted GLM.
# NOTE(review): which() positions align with train_data_scaled rows only
# because glm() dropped no rows (all predictor NAs were imputed) — confirm.
cooks_d <- cooks.distance(glm_original_model)
# Conventional "highly influential" cut-off of 0.5
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))## [1] "Number of influential points to remove: 2"
# Remove all influential points
if (length(outlier_indices) > 0) {
train_data_scaled <- train_data_scaled[-outlier_indices, ]
print(paste("Removed", length(outlier_indices), "influential points"))
}## [1] "Removed 2 influential points"
Refit the GLM with the updated training data.
# Refit on the reduced training data; same formula and family as before
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.14156 0.08507 13.419 < 2e-16 ***
## koi_period 0.29864 0.21669 1.378 0.16814
## koi_duration 1.29810 0.10383 12.502 < 2e-16 ***
## koi_depth 0.77960 0.34790 2.241 0.02503 *
## koi_prad 0.01257 0.24641 0.051 0.95933
## koi_teq 1.99661 0.12031 16.595 < 2e-16 ***
## koi_insol -0.83265 0.09432 -8.828 < 2e-16 ***
## koi_model_snr -0.08312 0.08217 -1.012 0.31171
## koi_steff -0.51090 0.15898 -3.214 0.00131 **
## koi_slogg 0.75842 0.08950 8.474 < 2e-16 ***
## koi_srad 0.08568 0.07305 1.173 0.24087
## koi_smass 0.51646 0.10212 5.057 4.25e-07 ***
## koi_impact 0.40769 0.10601 3.846 0.00012 ***
## koi_ror 5.73441 0.50313 11.398 < 2e-16 ***
## koi_srho 0.02551 0.03030 0.842 0.39986
## koi_sma -0.40431 0.25554 -1.582 0.11361
## koi_incl -0.25444 0.10093 -2.521 0.01170 *
## koi_dor 0.65097 0.08326 7.819 5.35e-15 ***
## koi_ldm_coeff1 -1.01362 0.47104 -2.152 0.03141 *
## koi_ldm_coeff2 -0.95850 0.37285 -2.571 0.01015 *
## koi_smet -0.57885 0.05791 -9.996 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8867.7 on 6396 degrees of freedom
## Residual deviance: 5051.9 on 6376 degrees of freedom
## AIC: 5093.9
##
## Number of Fisher Scoring iterations: 8
Check outliers again.
Find and remove all influential points with Cook’s distance > 0.5
# Second pass of the influence check: removing points changes the fit, so
# Cook's distances must be recomputed on the refitted model.
cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))## [1] "Number of influential points to remove: 1"
# Remove all influential points
if (length(outlier_indices) > 0) {
train_data_scaled <- train_data_scaled[-outlier_indices, ]
print(paste("Removed", length(outlier_indices), "influential points"))
}## [1] "Removed 1 influential points"
Refit the GLM with the updated training data.
# Refit once more after this removal
glm_original_model <- glm(glm_original_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.13220 0.08512 13.301 < 2e-16 ***
## koi_period 0.27949 0.21694 1.288 0.19763
## koi_duration 1.30688 0.10405 12.561 < 2e-16 ***
## koi_depth 0.77501 0.34777 2.229 0.02585 *
## koi_prad 0.01662 0.26552 0.063 0.95008
## koi_teq 2.02544 0.12105 16.732 < 2e-16 ***
## koi_insol -1.15821 0.12615 -9.181 < 2e-16 ***
## koi_model_snr -0.08300 0.08220 -1.010 0.31262
## koi_steff -0.51294 0.15913 -3.223 0.00127 **
## koi_slogg 0.76720 0.08972 8.551 < 2e-16 ***
## koi_srad 0.09537 0.07306 1.305 0.19175
## koi_smass 0.51567 0.10259 5.026 5.00e-07 ***
## koi_impact 0.41457 0.10607 3.908 9.29e-05 ***
## koi_ror 5.74178 0.51005 11.257 < 2e-16 ***
## koi_srho 0.02530 0.03031 0.835 0.40399
## koi_sma -0.38305 0.25592 -1.497 0.13446
## koi_incl -0.24436 0.10075 -2.426 0.01529 *
## koi_dor 0.65519 0.08338 7.858 3.91e-15 ***
## koi_ldm_coeff1 -1.00526 0.47154 -2.132 0.03302 *
## koi_ldm_coeff2 -0.95198 0.37324 -2.551 0.01075 *
## koi_smet -0.58088 0.05801 -10.013 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 5045.5 on 6375 degrees of freedom
## AIC: 5087.5
##
## Number of Fisher Scoring iterations: 8
Check outliers again.
No observations with Cook's distance above 0.5 remain, so we stop the removal loop here and proceed with this fitted model.
Predict on the test set and evaluate.
# Evaluate the final (post outlier-removal) model on the test set
original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Keep probabilities strictly inside (0, 1)
original_probs <- pmin(pmax(original_probs, 1e-15), 1 - 1e-15)
threshold <- 0.5
# Level ordering for the prediction factor comes from the test target
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # 'candidate'
class_labels <- ifelse(original_probs > threshold, positive_class, negative_class)
original_preds <- factor(class_labels, levels = test_target_levels)
test_target_factor <- test_data_scaled[[target_variable_name]]
# 'false_positive' remains the positive class for all reported metrics
cm <- confusionMatrix(original_preds, test_target_factor, positive = positive_class)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 708 186
## false_positive 98 607
##
## Accuracy : 0.8224
## 95% CI : (0.8028, 0.8408)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6444
##
## Mcnemar's Test P-Value : 2.437e-07
##
## Sensitivity : 0.7654
## Specificity : 0.8784
## Pos Pred Value : 0.8610
## Neg Pred Value : 0.7919
## Prevalence : 0.4959
## Detection Rate : 0.3796
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.8219
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Same heatmap as before, now for the refitted model's confusion matrix
cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC for the refitted model; see earlier NOTE about pROC's 'direction'
roc_original <- roc(
response = test_target_factor,
predictor = original_probs,
levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))## [1] "GLM Original Features - AUC: 0.8874"
Plot ROC curve
plot(roc_original,
main = "ROC Curve - GLM Original Features",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Plots Pearson residuals against each continuous predictor in the model.
## Test stat Pr(>|Test stat|)
## koi_period 3.5013 0.0613197 .
## koi_duration 2.4793 0.1153538
## koi_depth 8.2495 0.0040764 **
## koi_prad 0.0072 0.9325124
## koi_teq 67.6959 2.220e-16 ***
## koi_insol 4.6907 0.0303268 *
## koi_model_snr 0.9118 0.3396400
## koi_steff 0.0083 0.9276262
## koi_slogg 18.8307 1.428e-05 ***
## koi_srad 7.6578 0.0056527 **
## koi_smass 4.8253 0.0280447 *
## koi_impact 8.5979 0.0033655 **
## koi_ror 0.8858 0.3466207
## koi_srho 0.1856 0.6666376
## koi_sma 15.9164 6.620e-05 ***
## koi_incl 13.9564 0.0001871 ***
## koi_dor 9.1079 0.0025451 **
## koi_ldm_coeff1 0.0243 0.8761724
## koi_ldm_coeff2 0.0422 0.8371761
## koi_smet 32.9217 9.595e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Calculates Variance Inflation Factors. Important when using original predictors. High VIF (> 5 or > 10) suggests multicollinearity might be inflating standard errors of coefficients, making them unstable/unreliable.
## [1] "VIF Values:"
## koi_period koi_duration koi_depth koi_prad koi_teq
## 45.365541 3.146117 2.405060 2.099924 5.875834
## koi_insol koi_model_snr koi_steff koi_slogg koi_srad
## 1.658242 1.735790 15.228120 4.938804 2.532314
## koi_smass koi_impact koi_ror koi_srho koi_sma
## 5.751579 2.101254 2.885427 1.112261 63.188275
## koi_incl koi_dor koi_ldm_coeff1 koi_ldm_coeff2 koi_smet
## 2.014036 4.972557 173.500591 113.851021 2.123956
# Check for high values
# NOTE(review): vif_values comes from a non-echoed chunk, presumably
# car::vif(glm_original_model) given the printed "VIF Values:" output — confirm.
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))## [1] "Max VIF: 173.5"
From the VIF values, we notice the following: 1. Physical Relationships: koi_period and koi_sma are expected to be highly correlated due to Kepler’s Third Law. 2. Parameter Dependencies: Limb darkening coefficients (koi_ldm_coeff1, koi_ldm_coeff2) are often highly correlated with each other and can also be correlated with stellar temperature (koi_steff). Stellar parameters (koi_steff, koi_teq, koi_smass) are often correlated among themselves.
We proceed by removing highly correlated predictors from the model.
# Remove highly correlated predictors: koi_sma (Kepler's-law twin of
# koi_period) and koi_ldm_coeff2 (near-collinear with koi_ldm_coeff1),
# the two columns driving the extreme VIFs above.
predictors_to_remove <- c("koi_sma", "koi_ldm_coeff2")
keep_train_cols <- setdiff(names(train_data_scaled), predictors_to_remove)
train_data_scaled <- train_data_scaled[, keep_train_cols]
keep_test_cols <- setdiff(names(test_data_scaled), predictors_to_remove)
test_data_scaled <- test_data_scaled[, keep_test_cols]
# Keep the predictor bookkeeping vector in sync with the data
predictor_cols <- setdiff(predictor_cols, predictors_to_remove)
# Refit: the formula is target ~ ., so the dropped columns simply vanish
# from the model
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)##
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.12710 0.08484 13.286 < 2e-16 ***
## koi_period -0.02010 0.07338 -0.274 0.78415
## koi_duration 1.26967 0.10099 12.573 < 2e-16 ***
## koi_depth 0.78093 0.34724 2.249 0.02452 *
## koi_prad 0.01503 0.26629 0.056 0.95498
## koi_teq 2.12831 0.10022 21.237 < 2e-16 ***
## koi_insol -1.20622 0.12232 -9.861 < 2e-16 ***
## koi_model_snr -0.07166 0.08178 -0.876 0.38091
## koi_steff -0.17450 0.08311 -2.100 0.03576 *
## koi_slogg 0.79903 0.08851 9.028 < 2e-16 ***
## koi_srad 0.11529 0.07186 1.604 0.10861
## koi_smass 0.46510 0.09684 4.803 1.57e-06 ***
## koi_impact 0.41540 0.10568 3.931 8.47e-05 ***
## koi_ror 5.69350 0.50833 11.200 < 2e-16 ***
## koi_srho 0.02612 0.03047 0.857 0.39119
## koi_incl -0.24948 0.10070 -2.477 0.01323 *
## koi_dor 0.62047 0.07931 7.823 5.14e-15 ***
## koi_ldm_coeff1 0.18397 0.06020 3.056 0.00224 **
## koi_smet -0.66341 0.04795 -13.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 5054.1 on 6377 degrees of freedom
## AIC: 5092.1
##
## Number of Fisher Scoring iterations: 8
The logistic regression model was fitted using scaled original numeric features (excluding the binary flag variables) to predict the koi_pdisposition.
Model Fit & Significant Predictors: The overall model shows a highly significant improvement over the null model (intercept only), indicated by the large drop in deviance (Null: 8866.2, Residual: 5045.5; these figures refer to the fit obtained after outlier removal but before dropping the collinear predictors koi_sma and koi_ldm_coeff2). In that model, several predictors showed a statistically significant relationship (p < 0.05) with the target variable: koi_duration, koi_depth, koi_teq, koi_insol, koi_steff, koi_slogg, koi_smass, koi_impact, koi_ror, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, and koi_smet. Notable predictors that were not statistically significant in that specific model include koi_period, koi_prad, koi_model_snr, koi_srad, koi_srho, and koi_sma. This could be due to collinearity with other included variables, which motivated the removal of koi_sma and koi_ldm_coeff2 in the reduced model shown immediately above.
Residual Analysis (Non-linearity Check): Tests for non-linearity (curvature) were performed on the model’s residuals. Significant evidence (p < 0.05) of non-linear patterns remaining in the residuals was found for the following predictors: koi_depth, koi_teq, koi_insol, koi_slogg, koi_srad, koi_smass, koi_impact, koi_sma, koi_incl, koi_dor, and koi_smet. koi_period was borderline significant. This indicates that the assumption of a purely linear relationship between these predictors and the log-odds of the outcome is likely violated.
Conclusion: The GLM identifies many significant predictors for exoplanet disposition. However, the residual analysis strongly suggests that the model’s linear structure is insufficient to capture the underlying relationships for numerous variables. This indicates that model performance could potentially be improved by incorporating non-linear effects or using models better suited to capturing such complexities.
Let’s start by fitting a GAM model in order to use smooth functions s() for predictors showing non-linearity in residual analysis.
# Predictors flagged by the residual curvature tests as needing smooth terms
nonlinear_predictors_gam <- c(
"koi_depth", "koi_teq", "koi_insol", "koi_slogg",
"koi_srad", "koi_smass", "koi_impact", "koi_sma",
"koi_incl", "koi_dor", "koi_smet", "koi_period"
)
# koi_sma was dropped earlier for collinearity, so this filter removes it;
# everything left in predictor_cols enters the GAM as a plain linear term
nonlinear_predictors_gam <- nonlinear_predictors_gam[nonlinear_predictors_gam %in% names(train_data_scaled)]
linear_predictors_gam <- setdiff(predictor_cols, nonlinear_predictors_gam)Prepare the formula for the GAM model.
# Assemble the GAM formula string: s() smooths for the non-linear predictors,
# plain terms for the rest. Either group may be empty (NULL).
smooth_terms <- if (length(nonlinear_predictors_gam) > 0) paste(paste0("s(", nonlinear_predictors_gam, ")"), collapse = " + ") else NULL
linear_terms <- if (length(linear_predictors_gam) > 0) paste(linear_predictors_gam, collapse = " + ") else NULL
# c() silently drops NULL entries, so an empty group cannot leave a doubled
# or dangling "+"; the old gsub() clean-up chain was a no-op and is removed.
gam_formula_str <- paste(target_variable_name, "~", paste(c(smooth_terms, linear_terms), collapse = " + "))
gam_formula <- as.formula(gam_formula_str)
print(paste("Using GAM formula:", gam_formula_str))## [1] "Using GAM formula: koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + koi_srho + koi_ldm_coeff1"
Fit the GAM model.
# Fit the GAM (mgcv::gam): smooths for the flagged predictors, linear terms
# for the rest; REML is used for smoothness (penalty) selection.
gam_model <- gam(gam_formula,
data = train_data_scaled,
family = binomial(link = "logit"),
method = "REML"
)
summary(gam_model)##
## Family: binomial
## Link function: logit
##
## Formula:
## koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) +
## s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) +
## s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) +
## koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror +
## koi_srho + koi_ldm_coeff1
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.94561 0.11063 8.548 < 2e-16 ***
## koi_duration 1.73728 0.14344 12.112 < 2e-16 ***
## koi_prad -0.28360 0.29322 -0.967 0.33345
## koi_model_snr -0.13978 0.08423 -1.659 0.09702 .
## koi_steff 0.01356 0.17555 0.077 0.93843
## koi_ror 6.64085 0.79541 8.349 < 2e-16 ***
## koi_srho -0.00168 0.03094 -0.054 0.95669
## koi_ldm_coeff1 0.20498 0.07090 2.891 0.00384 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(koi_depth) 2.721 3.404 6.681 0.0983 .
## s(koi_teq) 5.173 6.144 309.050 <2e-16 ***
## s(koi_insol) 1.000 1.000 0.004 0.9485
## s(koi_slogg) 4.254 5.476 53.962 <2e-16 ***
## s(koi_srad) 6.971 7.765 3.574 0.8206
## s(koi_smass) 1.002 1.004 0.513 0.4749
## s(koi_impact) 1.817 2.077 14.228 0.0305 *
## s(koi_incl) 2.289 2.865 3.743 0.3121
## s(koi_dor) 4.856 6.004 96.825 <2e-16 ***
## s(koi_smet) 5.843 6.826 97.286 <2e-16 ***
## s(koi_period) 2.696 3.438 4.490 0.2376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.544 Deviance explained = 46.3%
## -REML = 2458.4 Scale est. = 1 n = 6396
Predict on the test set and evaluate.
# GAM probabilities on the test set, clamped strictly inside (0, 1)
gam_probs <- predict(gam_model, newdata = test_data_scaled, type = "response")
gam_probs <- pmax(pmin(gam_probs, 1 - 1e-15), 1e-15)
# 0.5 cut-off; first target level ('candidate') is the negative class.
# NOTE(review): the cm_gam object used below appears to come from a
# non-echoed chunk (confusionMatrix on gam_preds) — confirm.
gam_preds <- factor(ifelse(gam_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 697 167
## false_positive 109 626
##
## Accuracy : 0.8274
## 95% CI : (0.808, 0.8456)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6546
##
## Mcnemar's Test P-Value : 0.0006014
##
## Sensitivity : 0.7894
## Specificity : 0.8648
## Pos Pred Value : 0.8517
## Neg Pred Value : 0.8067
## Prevalence : 0.4959
## Detection Rate : 0.3915
## Detection Prevalence : 0.4597
## Balanced Accuracy : 0.8271
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Heatmap of the GAM confusion matrix (cm_gam computed in a prior chunk)
cm_data <- as.data.frame(cm_gam$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# GAM ROC; see the earlier NOTE about pROC's auto-detected 'direction'
roc_gam <- roc(response = test_target_factor, predictor = gam_probs, levels = levels(test_target_factor))
print(paste("GAM - AUC:", round(auc(roc_gam), 4)))## [1] "GAM - AUC: 0.893"
Plot ROC curve
plot(roc_gam,
main = "ROC Curve - GAM",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Let’s try adding interaction terms to the original GLM model.
# Predictors chosen for pairwise interactions (strong effects in the earlier
# GLM/GAM fits); every other predictor enters as a main effect only.
interaction_subset <- c("koi_teq", "koi_slogg", "koi_ror", "koi_smet", "koi_impact")
# Ensure these exist in the data
interaction_subset <- interaction_subset[interaction_subset %in% names(train_data_scaled)]
# Get all other predictors to include as main effects
all_predictors <- setdiff(names(train_data_scaled), target_variable_name)
other_predictors <- setdiff(all_predictors, interaction_subset)Prepare the formula for the GLM model with interaction terms.
# Build the interaction formula: "(a + b + ...)^2" expands to all main
# effects plus every pairwise interaction of the chosen subset; remaining
# predictors are appended as plain main effects.
pairwise_block <- paste0("(", paste(interaction_subset, collapse = " + "), ")^2")
main_effect_block <- paste(other_predictors, collapse = " + ")
interaction_formula_str <- paste(
  target_variable_name, "~",
  pairwise_block,
  "+",
  main_effect_block
)
interaction_formula <- as.formula(interaction_formula_str)
print(paste("Using interaction formula:", interaction_formula_str))## [1] "Using interaction formula: koi_pdisposition ~ (koi_teq + koi_slogg + koi_ror + koi_smet + koi_impact)^2 + koi_period + koi_duration + koi_depth + koi_prad + koi_insol + koi_model_snr + koi_steff + koi_srad + koi_smass + koi_srho + koi_incl + koi_dor + koi_ldm_coeff1"
Fit the GLM model.
# Logistic regression with the pairwise-interaction formula.
# NOTE(review): the summary below shows coefficients on the order of
# 1e14-1e15 with astronomically large z values — a sign of numerical
# blow-up / (quasi-)complete separation rather than a meaningful fit.
# Check for convergence warnings and consider penalized logistic
# regression (e.g. brglm2/logistf) or pruning the interaction set.
glm_interaction_model <- glm(interaction_formula,
data = train_data_scaled,
family = binomial(link = "logit")
)
summary(glm_interaction_model)##
## Call:
## glm(formula = interaction_formula, family = binomial(link = "logit"),
## data = train_data_scaled)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.837e+14 9.182e+05 962444900 <2e-16 ***
## koi_teq 1.056e+15 1.991e+06 530666658 <2e-16 ***
## koi_slogg 4.326e+14 1.843e+06 234735460 <2e-16 ***
## koi_ror 1.465e+15 4.019e+06 364412212 <2e-16 ***
## koi_smet -3.529e+14 1.036e+06 -340529745 <2e-16 ***
## koi_impact 9.351e+13 2.577e+06 36282262 <2e-16 ***
## koi_period 2.376e+14 1.521e+06 156148218 <2e-16 ***
## koi_duration 4.758e+14 1.162e+06 409363536 <2e-16 ***
## koi_depth 1.864e+14 1.361e+06 137001244 <2e-16 ***
## koi_prad 2.407e+14 2.072e+06 116192510 <2e-16 ***
## koi_insol 3.147e+13 2.523e+06 12472902 <2e-16 ***
## koi_model_snr -4.602e+13 1.076e+06 -42785900 <2e-16 ***
## koi_steff -1.351e+14 1.779e+06 -75962347 <2e-16 ***
## koi_srad 1.993e+14 1.613e+06 123600517 <2e-16 ***
## koi_smass 2.297e+14 1.987e+06 115628797 <2e-16 ***
## koi_srho 3.575e+13 9.337e+05 38291236 <2e-16 ***
## koi_incl 4.230e+13 1.600e+06 26439186 <2e-16 ***
## koi_dor 1.143e+14 1.539e+06 74218505 <2e-16 ***
## koi_ldm_coeff1 1.398e+14 1.364e+06 102506756 <2e-16 ***
## koi_teq:koi_slogg 1.687e+14 5.822e+05 289846990 <2e-16 ***
## koi_teq:koi_ror -8.763e+14 2.987e+06 -293351502 <2e-16 ***
## koi_teq:koi_smet 2.522e+13 9.968e+05 25295452 <2e-16 ***
## koi_teq:koi_impact -3.046e+14 2.613e+06 -116592077 <2e-16 ***
## koi_slogg:koi_ror -3.951e+14 3.320e+06 -119029491 <2e-16 ***
## koi_slogg:koi_smet 1.152e+14 9.377e+05 122906001 <2e-16 ***
## koi_slogg:koi_impact -4.026e+13 2.231e+06 -18047890 <2e-16 ***
## koi_ror:koi_smet -4.314e+13 1.748e+06 -24684677 <2e-16 ***
## koi_ror:koi_impact -7.049e+13 1.159e+05 -608474447 <2e-16 ***
## koi_smet:koi_impact -1.282e+14 1.778e+06 -72140653 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8866.2 on 6395 degrees of freedom
## Residual deviance: 106184.6 on 6367 degrees of freedom
## AIC: 106243
##
## Number of Fisher Scoring iterations: 25
Predict on the test set and evaluate.
# Predicted probabilities of the positive class on the test set.
interaction_probs <- predict(glm_interaction_model, newdata = test_data_scaled, type = "response")
# Clamp probabilities strictly inside (0, 1) for numerical stability downstream.
interaction_probs <- pmax(pmin(interaction_probs, 1 - 1e-15), 1e-15)
# Threshold at 0.5: above -> positive class, otherwise the first factor level.
interaction_preds <- factor(ifelse(interaction_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
cm_interaction <- confusionMatrix(interaction_preds, test_target_factor, positive = positive_class)
cm_interaction## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 486 80
## false_positive 320 713
##
## Accuracy : 0.7498
## 95% CI : (0.7279, 0.7709)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5009
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8991
## Specificity : 0.6030
## Pos Pred Value : 0.6902
## Neg Pred Value : 0.8587
## Prevalence : 0.4959
## Detection Rate : 0.4459
## Detection Prevalence : 0.6460
## Balanced Accuracy : 0.7510
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Heatmap of the interaction-GLM confusion matrix (counts per actual/predicted pair).
# NOTE(review): this heatmap code is repeated verbatim for every model in this
# document — consider extracting a plot_cm_heatmap(cm, title) helper to avoid
# copy-paste mistakes.
cm_data <- as.data.frame(cm_interaction$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC curve for the interaction GLM; explicit levels() fixes control/case order.
roc_glm_interaction <- roc(response = test_target_factor, predictor = interaction_probs, levels = levels(test_target_factor))
print(paste("GLM with interaction - AUC:", round(auc(roc_glm_interaction), 4)))## [1] "GLM with interaction - AUC: 0.751"
Plot ROC curve
# print.auc = TRUE annotates the AUC value at (print.auc.x, print.auc.y).
plot(roc_glm_interaction,
main = "ROC Curve - GLM with interaction",
col = "blue",
lwd = 2,
print.auc = TRUE,
print.auc.y = 0.75,
print.auc.x = 0.75
)Prepare dataset and formula
# Build numeric model matrices for glmnet (it takes matrices, not data frames).
train_formula_lasso <- as.formula("~ .") # Include all predictors
x_train <- model.matrix(train_formula_lasso, data = train_data_scaled[, all_predictors])[, -1] # Predictor matrix, remove intercept
y_train <- train_data_scaled[[target_variable_name]] # Target factor
# Regularisation strength chosen by cross-validation (lambda minimising CV deviance).
# (This assignment was previously garbled into the trailing comment above,
# leaving best_lambda undefined before its use below.)
best_lambda <- cv_lasso_fit$lambda.min
x_test <- model.matrix(train_formula_lasso, data = test_data_scaled[, all_predictors])[, -1]
# Evaluate using the best lambda found by CV
lasso_probs <- predict(cv_lasso_fit, newx = x_test, s = best_lambda, type = "response")
# Clamp strictly inside (0, 1) and flatten glmnet's 1-column matrix to a vector.
lasso_probs <- as.vector(pmax(pmin(lasso_probs, 1 - 1e-15), 1e-15)) # Ensure stability
# Threshold at 0.5: above -> positive class, otherwise the first factor level.
lasso_preds <- factor(ifelse(lasso_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))
Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 722 222
## false_positive 84 571
##
## Accuracy : 0.8086
## 95% CI : (0.7885, 0.8276)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6167
##
## Mcnemar's Test P-Value : 4.811e-15
##
## Sensitivity : 0.7201
## Specificity : 0.8958
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.7648
## Prevalence : 0.4959
## Detection Rate : 0.3571
## Detection Prevalence : 0.4096
## Balanced Accuracy : 0.8079
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
# Heatmap of the lasso model's confusion matrix (counts per actual/predicted pair).
cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC curve
# ROC curve for the lasso model; explicit levels() fixes control/case order.
roc_lasso <- roc(response = test_target_factor, predictor = lasso_probs, levels = levels(test_target_factor))
print(paste("Lasso - AUC:", round(auc(roc_lasso), 4)))## [1] "Lasso - AUC: 0.8816"
Plot ROC curve
# ROC curve for the lasso model, with the AUC value annotated on the plot.
plot(roc_lasso,
  main = "ROC Curve - Lasso",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)
# Ridge: regularisation strength chosen by cross-validation. (This assignment
# was previously fused onto the closing paren above — a syntax error as rendered.)
best_lambda_ridge <- cv_ridge_fit$lambda.min
# Ridge model: predicted probabilities on the test set at the CV-chosen lambda.
ridge_probs <- predict(cv_ridge_fit, newx = x_test, s = best_lambda_ridge, type = "response")
# Clamp strictly inside (0, 1) and flatten glmnet's 1-column matrix to a vector.
ridge_probs <- as.vector(pmax(pmin(ridge_probs, 1 - 1e-15), 1e-15)) # Ensure stability
# Threshold at 0.5: above -> positive class, otherwise the first factor level.
ridge_preds <- factor(ifelse(ridge_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))Confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction candidate false_positive
## candidate 717 214
## false_positive 89 579
##
## Accuracy : 0.8105
## 95% CI : (0.7904, 0.8294)
## No Information Rate : 0.5041
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6205
##
## Mcnemar's Test P-Value : 1.051e-12
##
## Sensitivity : 0.7301
## Specificity : 0.8896
## Pos Pred Value : 0.8668
## Neg Pred Value : 0.7701
## Prevalence : 0.4959
## Detection Rate : 0.3621
## Detection Prevalence : 0.4178
## Balanced Accuracy : 0.8099
##
## 'Positive' Class : false_positive
##
Plot confusion matrix
cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
# Heatmap of cm_data: counts for each (actual, predicted) class pair.
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
# Use a color scheme less prone to issues for colorblindness if possible
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
theme_minimal(base_size = 12) + # Set base font size
labs(
title = "Confusion Matrix Heatmap", # Corrected title
x = "Actual Class",
y = "Predicted Class"
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)ROC Curve
# ROC curve for the ridge model; explicit levels() fixes control/case order.
roc_ridge <- roc(response = test_target_factor, predictor = ridge_probs, levels = levels(test_target_factor))
print(paste("Ridge Regression - AUC:", round(auc(roc_ridge), 4)))## [1] "Ridge Regression - AUC: 0.8828"
Plot ROC curve
# ROC curve for the ridge model, with the AUC value annotated on the plot.
plot(roc_ridge,
  main = "ROC Curve - Ridge Regression", # fixed: was mislabeled "Lasso" (copy-paste)
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)
# Create data frame with model performance metrics
# Assemble a side-by-side test-set comparison of every fitted model:
# accuracy/sensitivity/specificity from each caret confusionMatrix, plus the
# AUC from each pROC roc object. local() keeps the helper lists out of the
# global environment.
models_performance <- local({
  cms <- list(cm, cm_gam, cm_interaction, lasso_cm, ridge_cm)
  rocs <- list(roc_original, roc_gam, roc_glm_interaction, roc_lasso, roc_ridge)
  data.frame(
    Model = c("GLM", "GAM", "GLM with Interactions", "Lasso", "Ridge"),
    Accuracy = vapply(cms, function(m) unname(m$overall["Accuracy"]), numeric(1)),
    Sensitivity = vapply(cms, function(m) unname(m$byClass["Sensitivity"]), numeric(1)),
    Specificity = vapply(cms, function(m) unname(m$byClass["Specificity"]), numeric(1)),
    AUC = vapply(rocs, function(r) as.numeric(auc(r)), numeric(1))
  )
})
# Round all metric columns (everything except Model) to 4 decimal places.
models_performance[, 2:5] <- round(models_performance[, 2:5], 4)
# Persist the comparison table for downstream reports.
save(models_performance, file = "data/Rdas/models_performance.Rda")
# Display the results
models_performance